# Load in the whole Alzheimer's data file (protein assay measurements plus
# demographics and the impairment response, one row per record)
data <- read.csv('~/repos/portfolio/demonstrative/R/datasets/alzheimers/alzheimers.csv')
# Remove this field ... I forgot to do that when creating the dataset
# (the normalized 'gender' factor created below supersedes the raw 'male' flag)
data <- data %>% select(-male)
# Normalize gender labels
# Guard: the normalization below assumes no missing gender values
stopifnot(all(!is.na(data$gender)))
normalize.gender <- function(x) {
  # Collapse free-form gender labels (e.g. "M", "male", "Female") into a
  # two-level factor: any value whose lowercased form starts with "m" is
  # "Male", everything else is "Female". NA inputs remain NA.
  #
  # Uses base grepl instead of stringr::str_detect (no extra dependency)
  # and pins the factor levels explicitly so the level set does not depend
  # on which labels happen to occur in the data.
  is.male <- grepl('^m', tolower(x))
  labels <- ifelse(is.na(x), NA, ifelse(is.male, 'Male', 'Female'))
  factor(labels, levels=c('Female', 'Male'))
}
# Normalize the raw gender labels into a consistent two-level factor
data <- data %>% mutate(gender=normalize.gender(gender))
# Convert integer fields to numeric for the sake of consistency.
# (Base replacement for dplyr::mutate_each_, which is defunct in modern dplyr.)
int.cols <- c('Betacellulin', 'Eotaxin_3')
data[int.cols] <- lapply(data[int.cols], as.numeric)
head(data)
# Split into the predictor frame X and the response vector y
X <- data %>% select(-response)
y <- data[,'response']
# Raw class counts (Impaired vs NotImpaired)
table(y)
y
Impaired NotImpaired
91 242
# Class proportions: ~27% Impaired vs ~73% NotImpaired (moderately imbalanced)
table(y) / length(y)
y
Impaired NotImpaired
0.2732733 0.7267267
# List all 130 predictor columns (128 numeric assays plus Genotype and gender)
names(X)
[1] "ACE_CD143_Angiotensin_Converti" "ACTH_Adrenocorticotropic_Hormon"
[3] "AXL" "Adiponectin"
[5] "Alpha_1_Antichymotrypsin" "Alpha_1_Antitrypsin"
[7] "Alpha_1_Microglobulin" "Alpha_2_Macroglobulin"
[9] "Angiopoietin_2_ANG_2" "Angiotensinogen"
[11] "Apolipoprotein_A_IV" "Apolipoprotein_A1"
[13] "Apolipoprotein_A2" "Apolipoprotein_B"
[15] "Apolipoprotein_CI" "Apolipoprotein_CIII"
[17] "Apolipoprotein_D" "Apolipoprotein_E"
[19] "Apolipoprotein_H" "B_Lymphocyte_Chemoattractant_BL"
[21] "BMP_6" "Beta_2_Microglobulin"
[23] "Betacellulin" "C_Reactive_Protein"
[25] "CD40" "CD5L"
[27] "Calbindin" "Calcitonin"
[29] "CgA" "Clusterin_Apo_J"
[31] "Complement_3" "Complement_Factor_H"
[33] "Connective_Tissue_Growth_Factor" "Cortisol"
[35] "Creatine_Kinase_MB" "Cystatin_C"
[37] "EGF_R" "EN_RAGE"
[39] "ENA_78" "Eotaxin_3"
[41] "FAS" "FSH_Follicle_Stimulation_Hormon"
[43] "Fas_Ligand" "Fatty_Acid_Binding_Protein"
[45] "Ferritin" "Fetuin_A"
[47] "Fibrinogen" "GRO_alpha"
[49] "Gamma_Interferon_induced_Monokin" "Glutathione_S_Transferase_alpha"
[51] "HB_EGF" "HCC_4"
[53] "Hepatocyte_Growth_Factor_HGF" "I_309"
[55] "ICAM_1" "IGF_BP_2"
[57] "IL_11" "IL_13"
[59] "IL_16" "IL_17E"
[61] "IL_1alpha" "IL_3"
[63] "IL_4" "IL_5"
[65] "IL_6" "IL_6_Receptor"
[67] "IL_7" "IL_8"
[69] "IP_10_Inducible_Protein_10" "IgA"
[71] "Insulin" "Kidney_Injury_Molecule_1_KIM_1"
[73] "LOX_1" "Leptin"
[75] "Lipoprotein_a" "MCP_1"
[77] "MCP_2" "MIF"
[79] "MIP_1alpha" "MIP_1beta"
[81] "MMP_2" "MMP_3"
[83] "MMP10" "MMP7"
[85] "Myoglobin" "NT_proBNP"
[87] "NrCAM" "Osteopontin"
[89] "PAI_1" "PAPP_A"
[91] "PLGF" "PYY"
[93] "Pancreatic_polypeptide" "Prolactin"
[95] "Prostatic_Acid_Phosphatase" "Protein_S"
[97] "Pulmonary_and_Activation_Regulat" "RANTES"
[99] "Resistin" "S100b"
[101] "SGOT" "SHBG"
[103] "SOD" "Serum_Amyloid_P"
[105] "Sortilin" "Stem_Cell_Factor"
[107] "TGF_alpha" "TIMP_1"
[109] "TNF_RII" "TRAIL_R3"
[111] "TTR_prealbumin" "Tamm_Horsfall_Protein_THP"
[113] "Thrombomodulin" "Thrombopoietin"
[115] "Thymus_Expressed_Chemokine_TECK" "Thyroid_Stimulating_Hormone"
[117] "Thyroxine_Binding_Globulin" "Tissue_Factor"
[119] "Transferrin" "Trefoil_Factor_3_TFF3"
[121] "VCAM_1" "VEGF"
[123] "Vitronectin" "von_Willebrand_Factor"
[125] "age" "tau"
[127] "p_tau" "Ab_42"
[129] "Genotype" "gender"
# Tabulate column classes: 128 numeric, 2 factor
X %>% sapply(class) %>% table
.
factor numeric
2 128
# Identify which predictors are the two factor columns
#X %>% sapply(class) %>% .[. == 'factor']
names(X)[sapply(X, class) == 'factor']
[1] "Genotype" "gender"
We could start by looking at the relationship between some of the more intuitive variables like Age, Gender, and Genotype and impairment, to see if there are any obvious relationships.
# Bin age into 5 equal-width ranges and plot impairment counts per bin
data %>%
mutate(age_range=cut(age, breaks=5)) %>%
group_by(age_range, response) %>% tally %>%
plot_ly(x=~age_range, y=~n, color=~response, type='bar') %>%
plotly::layout(hovermode='closest', title='Age vs Impairment')
minimal value for n is 3, returning requested palette with 3 different levels
minimal value for n is 3, returning requested palette with 3 different levels
# Impairment counts per Genotype level
# (presumably APOE genotypes -- confirm against the dataset documentation)
data %>%
group_by(Genotype, response) %>% tally %>%
plot_ly(x=~Genotype, y=~n, color=~response, type='bar') %>%
plotly::layout(hovermode='closest', title='Genotype vs Impairment')
minimal value for n is 3, returning requested palette with 3 different levels
minimal value for n is 3, returning requested palette with 3 different levels
# Impairment counts per gender level
# (note: unqualified layout() here vs plotly::layout() in the chunks above)
data %>%
group_by(gender, response) %>% tally %>%
plot_ly(x=~gender, y=~n, color=~response, type='bar') %>%
layout(hovermode='closest', title='Gender vs Impairment')
minimal value for n is 3, returning requested palette with 3 different levels
minimal value for n is 3, returning requested palette with 3 different levels
Now what about the other ~125 variables? We can’t look at them all one at a time so perhaps there is a way to “condense” them into something more manageable:
library(corrplot)
# PCA candidates: the numeric assay columns only (drop the two factors)
d.pca <- X %>% select(-gender, -Genotype)
# corrplot() here is relied on to return the (hclust-reordered) correlation
# matrix. NOTE(review): newer corrplot versions return a list instead of the
# matrix -- confirm the installed version before rerunning.
cor.mat <- corrplot(cor(d.pca), order='hclust', tl.col='black', tl.cex=.5)
# Extract the variable names from the figure below since they will be useful
# for creating similar plots for comparison (and it's easier to compare in the same order)
cor.var <- rownames(cor.mat)
#d.pca <- X %>% select(-gender, -Genotype)
# Principal components on the standardized assay columns
pca <- prcomp(d.pca, scale=T)
# Cumulative variance explained by component index (row 3 of importance)
data.frame(Cumulative.Variance=summary(pca)$importance[3,]) %>% mutate(PC.Number=1:n()) %>%
plot_ly(x=~PC.Number, y=~Cumulative.Variance, mode='lines', type='scatter') %>%
layout(title='Cumulative Variance Explained by Principal Components')
# Correlate the original variables (in hclust order) against the first 25 PCs
corrplot(cor(d.pca[,cor.var], pca$x[,1:25]), tl.col='black', tl.cex=.5)
#registerCores(1)
# Project-specific trainer object (provides getModel/train with disk caching)
tr <- proj$getTrainer()
# Shared 10-fold CV control: keep class probabilities and final predictions
# so both ROC-based (twoClassSummary) and accuracy-based (defaultSummary)
# metrics are available per resample.
# (T/F replaced with TRUE/FALSE -- T and F are reassignable bindings.)
tc <- trainControl(
  classProbs=TRUE, method='cv', number=10,
  summaryFunction = function(...) c(twoClassSummary(...), defaultSummary(...)),
  verboseIter=FALSE, savePredictions='final', returnResamp='final', allowParallel=TRUE
)
get.ensemble.model <- function(tuneList){
  # Build a caretEnsemble model spec: each base learner in `tuneList` is
  # trained with 5-fold CV, then combined via a simple averaging meta-model.
  #
  # Args:
  #   tuneList: named list of caretModelSpec() base learner specifications
  # Returns: an ensemble model usable as the `method` argument to tr$getModel
  #
  # (T/F replaced with TRUE/FALSE -- T and F are reassignable bindings.)
  caret.list.args <- list(
    trControl=trainControl(
      method='cv', number=5, classProbs=TRUE,
      returnData=FALSE, savePredictions='final',
      allowParallel=TRUE, verboseIter=FALSE
    ),
    tuneList=tuneList
  )
  caret.stack.args <- list(
    # method='none' for the stacker: simple averaging needs no tuning
    method=GetEnsembleAveragingModel(),
    trControl=trainControl(method='none', savePredictions = 'final')
  )
  GetCaretEnsembleModel(caret.list.args, caret.stack.args)
}
# One-hot encode the factor columns (Genotype, gender) into numeric dummies
X.m <- predict(dummyVars(~., X), X) %>% as.data.frame
# Columns excluded from PCA compression: demographic dummies plus age
non.pca.vars <- X.m %>% select(starts_with('gender'), starts_with('Genotype'), starts_with('age')) %>% names
pca.split <- function(X) {
  # Partition a feature frame into the columns kept verbatim ("var":
  # the demographic/dummy columns listed in non.pca.vars) and the
  # remaining assay columns destined for PCA compression ("pca").
  var.cols <- X %>% dplyr::select(one_of(non.pca.vars))
  pca.cols <- X %>% dplyr::select(-one_of(non.pca.vars))
  list(var=var.cols, pca=pca.cols)
}
pca.combine <- function(pp, X) {
  # Rebuild a design matrix from raw features: the pass-through columns
  # are kept as-is and the assay columns are projected through the fitted
  # preProcess object `pp` (center/scale/PCA), then bound back together.
  parts <- pca.split(X)
  projected <- predict(pp, newdata=parts$pca)
  cbind(parts$var, projected)
}
get.model <- function(model.name, pca.comp=40){
  # Wrap a built-in caret model so that PCA compression of the assay columns
  # happens INSIDE each resampling fold (preventing information leakage from
  # fitting PCA on held-out rows). Demographic columns pass through untouched.
  #
  # Args:
  #   model.name: name of a caret model (a key of getModelInfo())
  #   pca.comp: number of principal components to retain (default 40,
  #             matching the previously hard-coded value)
  # Returns: a caret custom-model list with fit/predict/prob overridden
  model <- getModelInfo()[[model.name]]
  m <- model
  m$fit <- function(x, y, wts, param, lev, last, classProbs, ...){
    X <- pca.split(x)
    # Fit the preprocessor on this training fold only
    pp <- preProcess(X$pca, method=c('center', 'scale', 'pca'), pcaComp = pca.comp)
    X.train <- cbind(X$var, predict(pp, newdata=X$pca))
    modelFit <- model$fit(X.train, y, wts, param, lev, last, classProbs, ...)
    # Stash the preprocessor so predict/prob can apply the same projection
    modelFit$pp <- pp
    modelFit$feature.names <- names(X.train)
    modelFit
  }
  m$predict <- function (modelFit, newdata, submodels = NULL) {
    X.test <- pca.combine(modelFit$pp, newdata)
    model$predict(modelFit, X.test, submodels)
  }
  m$prob <- function (modelFit, newdata, submodels = NULL){
    X.test <- pca.combine(modelFit$pp, newdata)
    model$prob(modelFit, X.test, submodels)
  }
  if (model.name == 'xgbTree'){
    # xgboost needs the stored feature names to compute importance, since
    # the PCA-projected training frame differs from the raw inputs
    m$varImp <- function(object, numTrees = NULL, ...) {
      imp <- xgb.importance(object$feature.names, model = object)
      imp <- as.data.frame(imp)[, 1:2]
      rownames(imp) <- as.character(imp[,1])
      imp <- imp[,2,drop = FALSE]
      colnames(imp) <- "Overall"
      imp
    }
  }
  m
}
# Candidate models, each wrapped with in-fold PCA compression via get.model.
# NOTE(review): pre.proc is referenced here but its visible definition appears
# later in the file -- this relies on notebook execution order; confirm.
# (T/F replaced with TRUE/FALSE -- T and F are reassignable bindings.)
models <- list(
  tr$getModel('pca_glm', method=get.model('glm'), trControl=tc),
  tr$getModel('pca_glmnet', method=get.model('glmnet'), trControl=tc, tuneLength=5),
  tr$getModel('pca_rf', method=get.model('rf'), trControl=tc, tuneGrid=expand.grid(.mtry = c(2,4,8))),
  tr$getModel('pca_rpart', method=get.model('rpart'), tuneLength=10, trControl=tc),
  tr$getModel('pca_gbm', method=get.model('gbm'), tuneLength=5, trControl=tc, verbose=FALSE),
  tr$getModel('pca_xgb', method=get.model('xgbTree'), tuneLength=5, trControl=tc),
  tr$getModel('pca_spline', method=get.model('earth'), preProcess=pre.proc, trControl=tc, tuneLength=5),
  tr$getModel('pca_nnet', method=get.model('nnet'), preProcess=pre.proc, trControl=tc, tuneLength=5, trace=FALSE)
)
# Key results by model name; training restores from disk cache when available
names(models) <- sapply(models, function(m) m$name)
pca.results <- lapply(models, function(m) tr$train(m, X.m, y, enable.cache=TRUE)) %>% setNames(names(models))
2016-11-07 18:36:04 INFO::Beginning training for model "pca_glm" (cache name = "model_pca_glm")
2016-11-07 18:36:04 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_pca_glm.Rdata" from disk
2016-11-07 18:36:04 INFO::Training complete for model "pca_glm"
2016-11-07 18:36:04 INFO::Beginning training for model "pca_glmnet" (cache name = "model_pca_glmnet")
2016-11-07 18:36:04 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_pca_glmnet.Rdata" from disk
2016-11-07 18:36:04 INFO::Training complete for model "pca_glmnet"
2016-11-07 18:36:04 INFO::Beginning training for model "pca_rf" (cache name = "model_pca_rf")
2016-11-07 18:36:04 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_pca_rf.Rdata" from disk
2016-11-07 18:36:04 INFO::Training complete for model "pca_rf"
2016-11-07 18:36:04 INFO::Beginning training for model "pca_rpart" (cache name = "model_pca_rpart")
2016-11-07 18:36:04 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_pca_rpart.Rdata" from disk
2016-11-07 18:36:04 INFO::Training complete for model "pca_rpart"
2016-11-07 18:36:04 INFO::Beginning training for model "pca_gbm" (cache name = "model_pca_gbm")
2016-11-07 18:36:04 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_pca_gbm.Rdata" from disk
2016-11-07 18:36:04 INFO::Training complete for model "pca_gbm"
2016-11-07 18:36:04 INFO::Beginning training for model "pca_xgb" (cache name = "model_pca_xgb")
2016-11-07 18:36:04 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_pca_xgb.Rdata" from disk
2016-11-07 18:36:04 INFO::Training complete for model "pca_xgb"
2016-11-07 18:36:04 INFO::Beginning training for model "pca_spline" (cache name = "model_pca_spline")
2016-11-07 18:36:04 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_pca_spline.Rdata" from disk
2016-11-07 18:36:04 INFO::Training complete for model "pca_spline"
2016-11-07 18:36:04 INFO::Beginning training for model "pca_nnet" (cache name = "model_pca_nnet")
2016-11-07 18:36:04 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_pca_nnet.Rdata" from disk
2016-11-07 18:36:04 INFO::Training complete for model "pca_nnet"
library(caretEnsemble)
# Shared preprocessing for scale-sensitive learners (glmnet, earth, nnet)
pre.proc <- c('center', 'scale')
# Base learners for the stacked ensemble built by get.ensemble.model.
# (F replaced with FALSE -- F is a reassignable binding.)
ens.model <- list(
  glmnet=caretModelSpec(method='glmnet', preProcess=pre.proc, tuneLength=5),
  gbm=caretModelSpec(method='gbm', tuneLength=5, verbose=FALSE),
  spline=caretModelSpec(method='earth', tuneLength=5, preProcess=pre.proc)
)
# Models trained on the full dummy-encoded feature set (no PCA wrapper),
# plus the stacked ensemble of the glmnet/gbm/earth base learners above.
# (T/F replaced with TRUE/FALSE -- T and F are reassignable bindings.)
models <- list(
  tr$getModel('glm', method='glm', preProcess=pre.proc, trControl=tc),
  tr$getModel('glmnet', method='glmnet', preProcess=pre.proc, trControl=tc, tuneLength=5),
  tr$getModel('nnet', method='nnet', preProcess=pre.proc, trControl=tc, tuneLength=5, trace=FALSE),
  tr$getModel('rpart', method='rpart', tuneLength=10, trControl=tc),
  tr$getModel('gbm', method='gbm', tuneLength=5, trControl=tc, verbose=FALSE),
  tr$getModel('xgb', method='xgbTree', tuneLength=5, trControl=tc),
  tr$getModel('spline', method='earth', preProcess=pre.proc, trControl=tc, tuneLength=5),
  tr$getModel('rf', method='rf', trControl=tc, tuneLength=5),
  tr$getModel('ensemble', method=get.ensemble.model(ens.model), trControl=tc, tuneLength=5)
)
# Key results by model name; training restores from disk cache when available
names(models) <- sapply(models, function(m) m$name)
all.results <- lapply(models, function(m) tr$train(m, X.m, y, enable.cache=TRUE)) %>% setNames(names(models))
2016-11-07 20:37:59 INFO::Beginning training for model "glm" (cache name = "model_glm")
2016-11-07 20:37:59 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_glm.Rdata" from disk
2016-11-07 20:37:59 INFO::Training complete for model "glm"
2016-11-07 20:37:59 INFO::Beginning training for model "glmnet" (cache name = "model_glmnet")
2016-11-07 20:37:59 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_glmnet.Rdata" from disk
2016-11-07 20:37:59 INFO::Training complete for model "glmnet"
2016-11-07 20:37:59 INFO::Beginning training for model "nnet" (cache name = "model_nnet")
2016-11-07 20:37:59 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_nnet.Rdata" from disk
2016-11-07 20:37:59 INFO::Training complete for model "nnet"
2016-11-07 20:37:59 INFO::Beginning training for model "rpart" (cache name = "model_rpart")
2016-11-07 20:37:59 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_rpart.Rdata" from disk
2016-11-07 20:37:59 INFO::Training complete for model "rpart"
2016-11-07 20:37:59 INFO::Beginning training for model "gbm" (cache name = "model_gbm")
2016-11-07 20:37:59 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_gbm.Rdata" from disk
2016-11-07 20:37:59 INFO::Training complete for model "gbm"
2016-11-07 20:37:59 INFO::Beginning training for model "xgb" (cache name = "model_xgb")
2016-11-07 20:37:59 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_xgb.Rdata" from disk
2016-11-07 20:37:59 INFO::Training complete for model "xgb"
2016-11-07 20:37:59 INFO::Beginning training for model "spline" (cache name = "model_spline")
2016-11-07 20:37:59 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_spline.Rdata" from disk
2016-11-07 20:37:59 INFO::Training complete for model "spline"
2016-11-07 20:37:59 INFO::Beginning training for model "rf" (cache name = "model_rf")
2016-11-07 20:37:59 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_rf.Rdata" from disk
2016-11-07 20:37:59 INFO::Training complete for model "rf"
2016-11-07 20:37:59 INFO::Beginning training for model "ensemble" (cache name = "model_ensemble")
2016-11-07 20:37:59 DEBUG::Restoring cached object "~/data/meetups/r_tutorials/tutorial_03/cache/models/model_ensemble.Rdata" from disk
2016-11-07 20:37:59 INFO::Training complete for model "ensemble"
# Compare CV accuracy distributions across all models (PCA-wrapped and raw),
# ordered left-to-right by median accuracy
rbind(GetResampleData(pca.results), GetResampleData(all.results)) %>%
mutate(model=reorder(model, accuracy, median)) %>%
plot_ly(x=~model, y=~accuracy, type='box') %>%
layout(margin=list(b=100))
# Variable importance per model for the raw-feature models, with features
# ordered by their mean importance score across models
var.imp <- GetVarImp(all.results)
var.imp %>%
  mutate(feature=reorder(feature, score, mean)) %>%
  plot_ly(x=~feature, y=~score, color=~model, mode='markers', type='scatter') %>%
  layout(margin=list(b=200))
# Same importance plot for the PCA-wrapped models. nnet is excluded --
# presumably its varImp does not work through the PCA wrapper; confirm.
var.model.names <- names(pca.results)[!str_detect(names(pca.results), 'nnet')]
var.imp <- GetVarImp(pca.results[var.model.names])
var.imp %>%
  mutate(feature=reorder(feature, score, mean)) %>%
  plot_ly(x=~feature, y=~score, color=~model, mode='markers', type='scatter') %>%
  layout(margin=list(b=200))
# Predictors and models for which to compute partial dependence curves
pd.vars <- c('tau', 'p_tau', 'Ab_42', 'age', 'gender.Male', 'gender.Female')
pd.models <- c('xgb', 'gbm', 'glmnet', 'spline', 'ensemble')
#pd.models <- c('ensemble')
pred.fun <- function(object, newdata) {
  # Probability extractor that handles both plain caret models and
  # caretStack ensembles (the latter must be predicted via finalModel).
  target <- if (inherits(object$finalModel, 'caretStack')) object$finalModel else object
  pred <- predict(target, newdata=newdata, type='prob')
  # Some methods return a bare vector, others a probability data frame;
  # in the latter case take the first class column.
  if (is.vector(pred)) pred else pred[,1]
}
# NOTE(review): debugging hook left enabled -- drops into recover() on any
# error; consider removing before sharing this script
options(error=recover)
#registerCores(1) # Increase this to make PD calcs faster
# Partial dependence of predicted probability on each pd.var, per model
pd.data <- GetPartialDependence(
all.results[pd.models], pd.vars, pred.fun,
X=X.m, # This can come from model objects but only if returnData=T in trainControl
grid.size=50, grid.window=c(0, 1), # Resize these to better fit range of data
sample.rate=1, # Decrease this if PD calculations take too long
verbose=F, seed=SEED
)
# Collapse per-observation PD curves to one mean curve per predictor/model/x
pd.mean <- foreach(pd=pd.data, .combine=rbind)%do%
{ pd$pd %>% dplyr::mutate(predictor=pd$predictor, model=pd$model) } %>%
dplyr::group_by(predictor, model, x) %>%
dplyr::summarise(y.mid=mean(y)) %>% ungroup
# Plot the mean PD curves, faceted by predictor with free axis scales
pd.mean %>% ggplot(aes(x=x, y=y.mid, color=model)) + geom_line() + facet_wrap(~predictor, scale='free') +
theme_bw() +
ylab('Predicted Probability') + xlab('Predictor Value')
# Plot the nonzero glmnet coefficients at the CV-selected lambda
glmnet.model <- all.results$glmnet$fit$finalModel
glmnet.coef <- predict(glmnet.model, s=all.results$glmnet$fit$bestTune$lambda, type='coefficients')
# Sparse one-column matrix -> named numeric vector
glmnet.coef <- glmnet.coef[,1]
# add_rownames() is deprecated; tibble::rownames_to_column() is its
# documented replacement (tibble ships as a dplyr dependency)
glmnet.coef %>% data.frame %>% setNames('Coefficient') %>% tibble::rownames_to_column(var='Feature') %>%
  filter(abs(Coefficient) > 0) %>%
  mutate(Feature=reorder(Feature, Coefficient, mean)) %>%
  plot_ly(x=~Feature, y=~Coefficient, type='bar') %>%
  layout(margin=list(b=200), title='Glmnet Coefficients')